Generating Figure 3

Author
Published

2026-01-08

Setup

Code
rpkg <- c("readr tidyverse fs stringr ggplot2 dplyr conflicted piggyback tidyr tibble downloadthis ggpubr")


import_pkg <- function(x)
  x |> trimws() |> strsplit("\\s+")  |> unlist() |> 
  lapply(function(x) library(x, character.only = T)) |> 
  invisible()

rpkg |> import_pkg()

# Resolve conflicted functions.
conflicted::conflict_prefer(name = "filter", winner = "dplyr",losers = "stats")

Multiple csv files

We begin by loading in the Jaccard .csv files we generated for each interval set in BeforeAfter.R. These files are also readily available for you in the Github Repository under the “Difference” folder.

To calculate the error and in Figure 3, we get the data from our calculations BeforeAfter as well, which is also readily available for you in the folder “avg_diff”.

Read multiple csv files in and organize them using fs to read in all the .csv files at once, as well as purrr and dplyr from tidyverse to organize them. We will do this for the main dataset and also for the grey error band we calculated earlier.

Code
# For main dataset
data_dir <- "~/Documents/BioHom/Difference/" #setup directory of where we want to read the multiple files. This is simply folder of the sumRes_03 dataframe for each interval-set we've calculated, all collated into one folder. 

#For error band
data_diff <- "~/Documents/BioHom/avg_diff/"

# Test to be sure this works by loading in all. csv files:
csv_files <- fs::dir_ls(data_dir, regexp = "\\.csv$")  # use fs to collect all data in data_dir that ends in .csv
csv_files_diff <- fs::dir_ls(data_diff, regexp = "\\.csv$") # for error

# Now load in the .csv files and ID each .csv to keep tabs:
all_Jacc <- data_dir |> 
  dir_ls(regexp = "\\.csv$") |>   # read in all files in the directory that end in .csv
  map_dfr(read_csv, .id = "source")   # id everything by source
#for errors:
all_errors <- data_diff |> 
  dir_ls(regexp = "\\.csv$") |>   # read in all files in the directory that end in .csv
  map_dfr(read_csv, .id = "source")   # id everything by source

# Set up better ID's:
all_Jacc$ID <- gsub(".*\\/([^\\/]*)(Kpg|LOI|PT|TJ|lome|Messi|SanCamp|AlbCen|Pliens|Assel|Serra).*", "\\2", all_Jacc$source)
# ^ add a new column with ID's based on all_Jacc$source, removing all but the interval-pair name from it
# Set up better ID's:
all_errors$ID <- gsub(".*\\/([^\\/]*)(Alb|Assel|Camp|Ceno|Cha|Dan|Dar|Het|Het|Hir|Ind|Kat|Maa|Messi|Ole|Plie|Rhae|Rhud|Sak|San|Toa|Zanc).*", "\\2", all_errors$source)

# Rearrange data for clarity using dplyr's arrange():
all_Jacc <- all_Jacc |>   arrange(ID, cutdist)
all_errors <- all_errors|>   arrange(ID, cutdist)

all_Jacc$source <- NULL # remove source column now that it is no longer needed
all_errors$source <- NULL # remove source column now that it is no longer needed

all_Jacc$...1 <- NULL # remove this weird column that was added in with source as well
all_errors$...1 <- NULL # remove this weird column that was added in with source as well

all_Jacc$age <- NULL # remove this weird column that was added in with source as well

# Separate the LOME and PT data for later:

# After
LOME_Jacc <- all_Jacc[all_Jacc$label == 'After' & all_Jacc$ID == 'lome', ]
PT_Jacc   <- all_Jacc[all_Jacc$label == 'After' & all_Jacc$ID == 'PT', ]

# After-2
LOME2_Jacc <- all_Jacc[all_Jacc$label == 'After-2' & all_Jacc$ID == 'lome', ]
PT2_Jacc   <- all_Jacc[all_Jacc$label == 'After-2' & all_Jacc$ID == 'PT', ]

# Add in missing cutdist values:
missing_LOME <- data.frame(cutdist=c(18000, 20000),
                           avg= c(NA, NA), sdev= c(NA, NA), 
                           n= c(NA,NA), se= c(NA,NA), first= c(NA,NA), 
                           second= c(NA, NA), label = c('After-2','After-2'),
                           ci = c(NA, NA),
                           ID = c('lome','lome'), Jaccard_change=c(NA,NA))

LOME_Jacc <-rbind(LOME_Jacc, missing_LOME)
LOME2_Jacc <- rbind(LOME2_Jacc, missing_LOME)

# Add in missing cutdist values:
missing_PT <- data.frame(cutdist=c(18000, 18000,20000, 20000),
                           avg= c(NA, NA, NA, NA),
                          sdev= c(NA, NA, NA, NA),
                           n=   c(NA, NA, NA, NA),
                           se=  c(NA, NA, NA, NA),
                         first= c(NA, NA, NA, NA),
                        second= c(NA, NA, NA, NA),
                        label = c('After','After-2', 'After', 'After-2'),
                           ci = c(NA, NA, NA, NA),
                           ID = c('PT','PT','PT', 'PT'),
                        Jaccard_change=c(NA,NA,NA,NA))

PT2_Jacc <- rbind(PT2_Jacc, missing_PT)

LOME2_Jacc$ID <- 'LOME2' # Give it a unique label
PT2_Jacc$ID <- 'PT2'

LOME_Jacc <- rbind(LOME_Jacc, LOME2_Jacc) # Bring After and After-2 together
PT_Jacc <- rbind(PT_Jacc, PT2_Jacc)

Before and after columns

All interval pairs need to have columns from 0 -20000 in order to perform the calculations, so those with missing columns are filled in with NA data. We are separating by the labels Before and After. The goal is to create two new columns, one ‘ave_before’ and one ‘ave_after’ so we can subtract one from the other and calculate the change in similarity:

Code
# Using dplyr, I add each missing row into a new df: 
missing <- data.frame(cutdist=c(20000, 20000, # AlbCen
                                20000, 20000,# Assel
                                20000, 20000, # LOI
                                18000, 20000, # lome
                                20000, 20000, # messi
                                20000, 20000, # pliens
                                18000, 18000, 20000, 20000, 20000, # PT
                                20000, 20000, #SanCamp
                                18000, 20000,  #Serra
                                20000), #TJ
                     avg=    c(NA, NA,
                               NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA, NA,
                               NA, NA, 
                               NA, NA, NA, NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA),
                     sdev=   c(NA, NA, 
                               NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA, NA, 
                               NA, NA, 
                               NA, NA, NA, NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA),
                     n=      c(NA, NA, 
                               NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA, NA, 
                               NA, NA,
                               NA, NA, NA, NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA),
                     se=     c(NA, NA, 
                               NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA, NA, 
                               NA, NA,
                               NA, NA, NA, NA, NA, 
                               NA, NA,
                               NA, NA, 
                               NA),
                     first=  c(NA, NA,
                               NA, NA, 
                               NA, NA,
                               NA, NA,  
                               NA, NA, 
                               NA, NA, 
                               NA, NA, NA, NA, NA,
                               NA, NA,
                               NA, NA, 
                               NA), 
                     second= c(NA, NA, NA,
                               NA,
                               NA, NA, 
                               NA, NA, 
                               NA, NA,
                               NA, NA, 
                               NA, NA, NA, NA, NA, 
                               NA, NA,
                               NA, NA, 
                               NA),
                     label=  c('Before', 'After',
                               'Before', 'After',
                               'Before', 'After',
                               'After-2', 'After-2', 
                               'Before','After', 
                               'Before','After',
                               'After', 'After-2', 'Before', "After", "After-2",
                               'Before','After',
                               'Before', 'After',
                               'After'),
                     ci=    c(NA, NA,
                              NA, NA,
                              NA, NA,
                              NA, NA, 
                              NA, NA, 
                              NA, NA,
                              NA, NA, NA, NA, NA, 
                              NA,NA,
                              NA, NA, 
                              NA),
                     ID =   c('AlbCen', 'AlbCen',
                              'Assel', 'Assel',
                              'LOI', 'LOI',
                              'lome', 'lome',
                              'Messi','Messi',
                              'Pliens', 'Pliens', 
                              'PT', 'PT', 'PT', 'PT', 'PT',
                              'SanCamp','SanCamp',
                              'Serra', 'Serra',
                              'TJ'),
                     Jaccard_change= c(NA, NA,
                              NA, NA,
                              NA, NA,
                              NA, NA, 
                              NA, NA, 
                              NA, NA,
                              NA, NA, NA, NA, NA, 
                              NA,NA,
                              NA, NA, 
                              NA))

# Add these missing rows to the all_Jacc df
all_Jacc <- rbind(all_Jacc, missing)

# Rearrange all_Jacc by ID and cutDist and label
all_Jacc <- all_Jacc |> 
            arrange(ID, cutdist, label)

Difference

Separate the data to separate before and after columns so we can subtract them:

Code
# Remove After-2
all_Jacc2 <- all_Jacc |> 
  filter(!label == 'After-2')

# Before: 
before_data2 = 
  all_Jacc2 |>
  filter(label == 'Before') |> 
  group_by(cutdist,ID) |>
  mutate(ave_before = avg) |> 
  mutate(ave_first_b= first) |> 
  mutate(ave_second_b = second) |>
  select(c(ave_before ))

# After:
after_data2 = 
  all_Jacc2 |>
  filter(label == 'After') |> 
  group_by(cutdist,ID) |>
  mutate(ave_after = avg) |>
    mutate(ave_first= first) |> 
  mutate(ave_second = second) |> #change the name to avg 
  select(c(ave_after)) # only these columns are selected

before_data2 <- na.omit(before_data2)
after_data2 <- na.omit(after_data2)

# Add the after_data to before_data:
all_Jacc_diff2 <- left_join(after_data2, before_data2, by=c("cutdist", "ID"))

# Calculate the differences for avg:
all_Jacc_diff<- all_Jacc_diff2 |> 
  mutate(Tavg_diff = ave_after - ave_before )  

The interval-trio problem

Since the LOME and PT have three intervals that we are analyzing, we need to treat it differently so we can calculate the difference. We will do much of what was done in the previous chunk to After-2 and apply it to find the difference between that and the After column.

Code
# Before: 
before_LOME = 
  LOME_Jacc |>
  filter(label == 'After') |> 
  group_by(cutdist,ID) |>
 mutate(ave_before = avg) |>
 #change the name to avg 
  select(ave_before)

before_PT = 
  PT_Jacc |>
  filter(label == 'After') |> 
  group_by(cutdist,ID) |>
 mutate(ave_before = avg) |>
 select(ave_before)

# After:
after_LOME = 
  LOME2_Jacc |>
  group_by(cutdist,ID) |>
  filter(label == 'After-2') |> 
 mutate(ave_after = avg) |>
 #change the name to avg 
    ungroup() |> 
  select(ave_after) # only this column is selected

after_PT = 
  PT_Jacc |>
  group_by(cutdist,ID) |>
  filter(label == 'After-2') |> 
 mutate(ave_after = avg) |>
ungroup() |> 
  select(ave_after) # only this column is selected

# Add the after_data to before_data:
LOME_Jacc <- cbind(before_LOME,after_LOME)
LOME_Jacc$ID <- 'LOME2'

PT_Jacc <- cbind(before_PT, after_PT)
PT_Jacc$ID <- 'PT2'

# Calculate the differences for avg:
LOME_Jacc<- LOME_Jacc |> 
  mutate(Tavg_diff = ave_after - ave_before ) 


Jacc <- rbind(all_Jacc_diff,LOME_Jacc)

PT_Jacc <- PT_Jacc |> 
  mutate(Tavg_diff = ave_after - ave_before ) 

Jacc <- rbind(Jacc,PT_Jacc)


# Add a label to determine whether an interval is a mass extinction or background interval:

Jacc$mass <- NA

#Using stringr's str_detect and dplyr's mutate, we can do this:
Jacc <- Jacc |> 
  mutate(mass = case_when(str_detect(ID, "(lome|LOME2|PT|PT2|TJ|Kpg)$")~ "mass", TRUE ~ "background")) #all else are labeled background

Now, here is the final dataset with all interval pairs and trios!

Code
Jacc
# A tibble: 133 × 6
# Groups:   cutdist, ID [133]
   cutdist ID     ave_after ave_before Tavg_diff mass      
     <dbl> <chr>      <dbl>      <dbl>     <dbl> <chr>     
 1       0 AlbCen    0.0571     0.0417  0.0153   background
 2    2000 AlbCen    0.0345     0.0222  0.0123   background
 3    4000 AlbCen    0.0416     0.0435 -0.00194  background
 4    6000 AlbCen    0.0240     0.0338 -0.00981  background
 5    8000 AlbCen    0.0247     0.0258 -0.00110  background
 6   10000 AlbCen    0.0210     0.0292 -0.00816  background
 7   12000 AlbCen    0.0181     0.0241 -0.00595  background
 8   14000 AlbCen    0.0246     0.0250 -0.000375 background
 9   16000 AlbCen    0.0235     0.0212  0.00230  background
10   18000 AlbCen    0.0112     0.0771 -0.0659   background
# ℹ 123 more rows

Finding the “no-change” zone

For this particular set of plots, we are going to want the dataframes of when we compare each age to itself (Changhsingian-to-Changhsingian, for example) instead of comparing before and after. Luckily, we already calculated that in the previous set of codes (before_after.R). From that code, we are interested in looking at the dataframes for each age that stored all our calculated similarity values for each iteration of our subsamples per age. We loaded that dataset in earlier in this script and labeled it all_errors, which gives the age as well as the maximumum and minimum values of those differences.

Code
all_errors
# A tibble: 226 × 8
   cutdist  mean_diff      sd     n       se lower_95 upper_95 ID   
     <dbl>      <dbl>   <dbl> <dbl>    <dbl>    <dbl>    <dbl> <chr>
 1       0 -0.00132   0.00751    99 0.000755  -0.0160  0.0134  Alb  
 2    2000 -0.000565  0.00492    99 0.000495  -0.0102  0.00908 Alb  
 3    4000 -0.000692  0.00844    99 0.000848  -0.0172  0.0159  Alb  
 4    6000  0.000185  0.00661    99 0.000664  -0.0128  0.0131  Alb  
 5    8000 -0.000686  0.00532    99 0.000535  -0.0111  0.00975 Alb  
 6   10000  0.000160  0.00581    99 0.000584  -0.0112  0.0115  Alb  
 7   12000 -0.000378  0.00548    99 0.000551  -0.0111  0.0104  Alb  
 8   14000 -0.0000350 0.00629    99 0.000632  -0.0124  0.0123  Alb  
 9   16000 -0.000755  0.00853    99 0.000857  -0.0175  0.0160  Alb  
10   18000 -0.00788   0.0802     98 0.00810   -0.165   0.149   Alb  
# ℹ 216 more rows

To calculate the “band of no-change”, We want to use the average of the upper and lower 95% confidence intervals for each distance bin (labeled as cut_dist here) across all the ages.

Code
# Upper 95 confidence interval:
ave_up95ci <- all_errors |>
  group_by(cutdist) |>
  summarise(upci = mean(upper_95, na.rm = TRUE)) |> 
  ungroup()

# Lowe 95 confidence interval:
ave_low95ci <- all_errors |>
  group_by(cutdist) |>
  summarise(lowci = mean(lower_95, na.rm = TRUE)) |> 
  ungroup()

ciband <- left_join(ave_up95ci,ave_low95ci, by = "cutdist")
ciband$cutdist[is.na(ciband$cutdist)] <- 20000



ciband
# A tibble: 11 × 3
   cutdist   upci   lowci
     <dbl>  <dbl>   <dbl>
 1       0 0.0377 -0.0382
 2    2000 0.0260 -0.0263
 3    4000 0.0273 -0.0271
 4    6000 0.0219 -0.0222
 5    8000 0.0258 -0.0261
 6   10000 0.0286 -0.0277
 7   12000 0.0297 -0.0283
 8   14000 0.0323 -0.0317
 9   16000 0.0417 -0.0398
10   18000 0.0598 -0.0592
11   20000 0.0622 -0.0772

Plot it out

Here we create two plots to examine the change in similarity for each interval pair (or trio), as either a positive change representing trends towards homogenization, or a negative change representing change towards provincialization. The first plot is for background events and for intervals that directly succeed mass extinction events, so the Olenekian age which does not directly succeed the end-Permian mass extinction event is not included. The second plot investigates change in similarity across three subsequent ages for each of the Late Ordovician Mass Extinction (LOME) and for the end-Permian mass extinction events.

Code
# background versus mass extinctions plot:
all_Jacc_noOlen <- filter(Jacc, !(ID =="PT2" ))

Plot.allJacc <-  all_Jacc_noOlen |> 
  ggplot(aes(x = cutdist, 
             y = Tavg_diff,
             group = ID, 
             colour = mass)) +     
  geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) + 
  ylim(-0.25, 0.25) +
  geom_hline(yintercept = 0, linewidth = 0.5) + 
  scale_color_manual(values = c("grey50", "#CC0033")) +

  labs(
    x = "Great Circle Distance (km)",
    y = "Change in Similarity", 
    colour = "Extinction type"
  ) +
  theme_light() +
  geom_ribbon(
    data = ciband, 
    aes(x = cutdist, ymin = lowci, ymax = upci), 
    fill = "grey50", 
    alpha = 0.3, 
    inherit.aes = FALSE, #use this or there will be an error
  )+
  theme(aspect.ratio = 1)
Plot.allJacc

Code
ggsave("plot.allJacc.pdf", plot = Plot.allJacc, width = 10, height =6 , units = "in", dpi = 300)

all_Jacc3 <- Jacc |>   filter(ID %in% c("PT", "lome", "LOME2", "PT2"))


# LOME and PT interval-trio plot: 
Plot.Jacc3 <- all_Jacc3 |> 
  ggplot(aes(x = cutdist, 
             y = Tavg_diff,
             group = ID, 
             colour = ID)) +     
  geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) + ylim(-0.25, 0.25) +
  geom_hline(yintercept=0, linewidth=0.5) + 
scale_color_manual(values = c( "chocolate4","tan2", "dodgerblue2","skyblue2")) +
   geom_ribbon(
    data = ciband, 
    aes(x = cutdist, ymin = lowci, ymax = upci), 
    fill = "grey50", 
    alpha = 0.3, 
    inherit.aes = FALSE
  )+
labs(x = "Great Circle Distance (km)",
     y = "Change in Similarity", 
     colour = "Extinction type") +
  theme_light() +
  theme(plot.title = element_text(face = "bold"),
        axis.title = element_text(face = "bold"),
        legend.title = element_text(face = "bold"),
        aspect.ratio = 1)
Plot.Jacc3

Code
ggsave("plot.Jaccwaste-pt.pdf", plot = Plot.Jacc3, width = 10, height =6 , units = "in", dpi = 300)